In [1]:
import xml.etree.cElementTree as ET
import pprint as pp
import re
import codecs
import json
import pymongo
import collections as col
from itertools import islice
In [2]:
#I chose Plaisir in France as my area (https://www.openstreetmap.org/export#map=13/48.8129/1.9484).
#I grew up in Plaisir, and that is the reason for my choice.
#Please find the file for download here: https://drive.google.com/open?id=0Bxw66CBwG5RORnlvbmRiNFdON2c
OSM_FILE = "plaisir.osm"
In [3]:
#Creation of a sample file
SAMPLE_FILE = "sample.osm"
k = 10  # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each top level element whose tag is in tags."""
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')
    # Write every k-th top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))
    output.write('</osm>')
In [4]:
tags = {}
for event, elem in ET.iterparse("sample.osm"):
    if elem.tag not in tags:
        tags[elem.tag] = 1
    else:
        tags[elem.tag] += 1
print tags
What will I do to get a better view of the file?
In [5]:
tags_details = {}
keys = ["amenity", "shop", "sport", "place", "service", "building"]

def create_tags_details(binder, list_keys, filename):
    """
    Create a dictionary counting every attribute value for each attribute key listed in list_keys.
    This function aims to help me understand what is inside the dataset and what type of analysis could be made.
    """
    for key in list_keys:
        binder[key] = {}
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag == "tag" and elem.attrib["k"] in list_keys:
            key = elem.attrib["k"]
            value = elem.attrib["v"]
            if value not in binder[key]:
                binder[key][value] = 1
            else:
                binder[key][value] += 1
    return binder
In [6]:
create_tags_details(tags_details,keys,"sample.osm")
Out[6]:
What questions do I want to answer?
Audit street names
In [7]:
# Create a dict to store weird street types
street_types = col.defaultdict(set)
# Create a list listing expected street types
expected_street_type = ["Rue", "Route", "Ruelle", "Cours", "Avenue", "Impasse", "Mail","Boulevard", "Square", "Place", "Allee"]
# Create a regular expression to isolate weird street types
street_type_re = re.compile(r'^\w+', re.IGNORECASE)
def audit_street(street_types,street_name):
"""
This function aims to check if the first word of every street name matchs the list of expected word name expected_street_type.
If it doesn't, the first word and the complete street name are added to the street_types dictionary as a key / value pair.
Arg1: street_types >> the dictionary where to store the unexpected data.
Arg2: street_name >> the street_name to be audited.
"""
m = street_type_re.search(street_name)
if m:
street_type = m.group()
if street_type not in expected_street_type:
street_types[street_type].add(street_name)
def audit_street_map(file_in, street_types, pretty = False):
"""
This function aims to audit the file by isolating one by one the addr:street tags key/value pair.
The value of each pair is then audited thanks to the audit_street function.
"""
for _, element in ET.iterparse(file_in):
if element.tag == "way" or element.tag == "node":
for tag in element.iter("tag"):
if tag.attrib['k'] == "addr:street":
audit_street(street_types,tag.attrib["v"])
pp.pprint(dict(street_types))
In [8]:
audit_street_map("plaisir.osm",street_types, True)
One street type needs to be cleaned ('AVE'). We will clean it when shaping our data structure. Please find below the function we'll use to clean street types.
In [9]:
#detect keys that still contain a colon after the "addr:" prefix is removed
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')

def clean_street_type(address_tag, address_dict):
    """
    Transform address tags and add them as key / value pairs of a dedicated address dictionary.
    The function takes the address tag and the address dictionary as arguments:
    address_tag: any "addr:*" sub tag of a "node" or "way" top level tag.
    address_dict: a dictionary where we store key / value pairs of address elements (postcode, street, etc.).
    The function first creates a cleaned key from the current key of the tag by removing the "addr:" part of the string.
    If, after removing "addr:", the function still detects a colon, the tag isn't added to the dictionary because the data entry is not clean enough.
    Otherwise, if the value of the tag starts with "AVE" as its first word, "AVE" is replaced by "Avenue".
    Finally, the key / value pair is added to the address dictionary.
    """
    key = re.sub('addr:', '', address_tag['k']).strip()
    if lower_colon.match(key):
        return None
    else:
        if address_tag['v'].startswith("AVE"):
            address_dict[key] = re.sub(r'^AVE', 'Avenue', address_tag['v'])
        else:
            address_dict[key] = address_tag['v']
In [10]:
"""
Please find below a test of the function which presents how the function works.
"""
tree = ET.parse(OSM_FILE)
root = tree.getroot()
test_tags = root.findall("./node/tag")
test_address_tags_dict = {}
for tag in test_tags:
tag_dict = {}
if tag.attrib['k'].startswith("addr:"):
clean_street_type(tag.attrib,tag_dict)
if tag_dict:
test_address_tags_dict[tag.attrib['k'] + tag.attrib['v']] = tag_dict
def take(n, iterable):
"Return first n items of the iterable as a list"
return list(islice(iterable, n))
pp.pprint(take(20, test_address_tags_dict.iteritems()))
Audit postcodes
In [11]:
# Create a set to store unexpected postcodes
postcodes = set()
# Create a regular expression matching the expected five-digit postcode format
postcode_re = re.compile(r'\d{5}')

def audit_postcode(postcodes, postcode):
    """
    Check whether the postcode matches the expected format.
    If it doesn't, the postcode is added to the set.
    Arg1: postcodes >> the set where the unexpected data is stored.
    Arg2: postcode >> the postcode to be audited.
    """
    m = postcode_re.search(postcode)
    if m is None:
        postcodes.add(postcode)

def audit_postcode_map(file_in, postcodes, pretty=False):
    """
    Audit the file by isolating, one by one, the "addr:postcode" tag key / value pairs.
    The value of each pair is then audited by the audit_postcode function.
    """
    for _, element in ET.iterparse(file_in):
        if element.tag == "way" or element.tag == "node":
            for tag in element.iter("tag"):
                if tag.attrib['k'] == "addr:postcode":
                    audit_postcode(postcodes, tag.attrib["v"])
    pp.pprint(postcodes)
In [12]:
audit_postcode_map("plaisir.osm",postcodes, True)
Postcodes are already clean.
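Note that postcode_re only looks for five digits anywhere in the value, so a value like "78370 Plaisir" would still pass the audit. A stricter, anchored variant (my assumption, not run in the original audit) would flag such cases as well:
In [ ]:
# Stricter sketch (assumption, not part of the original audit):
# the whole value must be exactly five digits.
strict_postcode_re = re.compile(r'^\d{5}$')

def audit_postcode_strict(postcodes, postcode):
    if strict_postcode_re.match(postcode) is None:
        postcodes.add(postcode)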
Audit phone numbers
In [13]:
# Create a list to store unexpected phone numbers
phone_numbers = []
# Create a regular expression matching the expected phone format (+33 X XX XX XX XX)
phone_re = re.compile(r'\+\d{2}\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}')

def audit_phone(phone_numbers, phone_number):
    """
    Check whether the phone number matches the expected format.
    If it doesn't, the phone number is added to the list.
    Arg1: phone_numbers >> the list where the unexpected data is stored.
    Arg2: phone_number >> the phone number to be audited.
    """
    m = phone_re.search(phone_number)
    if m is None:
        phone_numbers.append(phone_number)

def audit_phone_map(file_in, phone_numbers, pretty=False):
    """
    Audit the file by isolating, one by one, the "phone" tag key / value pairs.
    The value of each pair is then audited by the audit_phone function.
    """
    for _, element in ET.iterparse(file_in):
        if element.tag == "way" or element.tag == "node":
            for tag in element.iter("tag"):
                if tag.attrib['k'] == "phone":
                    audit_phone(phone_numbers, tag.attrib["v"])
    return phone_numbers
In [14]:
audit_phone_map("plaisir.osm",phone_numbers, True)
Out[14]:
Phone numbers need to be cleaned to match the pattern "+33 X XX XX XX XX". We will clean them when shaping our data structure.
Here are the different cases we want to handle:
- 'XX XX XX XX XX'
- 'XX.XX.XX.XX.XX'
- 'XXXXXXXXXX'
Please find below the function we'll use to clean phone numbers.
In [15]:
#classic phone number format in France (ie: 01 30 55 84 22)
classic_france = re.compile(r'\d{2}\s\d{2}\s\d{2}\s\d{2}\s\d{2}')
#classic phone number format with dots (ie: 01.30.55.84.22)
classic_france_dot = re.compile(r'\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{2}')
#compact phone number format (ie: 0130558422)
classic_france_compact = re.compile(r'\d{10}')

def clean_phone_numbers(phone_tag, main_dict):
    """
    Clean phone tags and add them as key / value pairs of our node dictionary.
    The function takes the phone tag and the main dictionary as arguments:
    phone_tag: any phone sub tag of a "node" or "way" top level tag.
    main_dict: a dictionary where we store the key / value pairs of each element of our map.
    The function first identifies whether the phone number follows one of the wrong patterns found during the audit.
    Using regexes, we try to match each pattern, apply the necessary modifications when relevant, and store the phone number in the dictionary.
    Otherwise, the phone number is stored in the dictionary as-is.
    """
    if classic_france.match(phone_tag['v']):
        # Replace the leading 0 with the international prefix.
        value = re.sub(r'^\d', '+33 ', phone_tag['v'])
        main_dict[phone_tag['k']] = value
    elif classic_france_dot.match(phone_tag['v']):
        value = re.sub(r'^\d', '+33 ', phone_tag['v'])
        value = re.sub(r'\.', ' ', value)
        main_dict[phone_tag['k']] = value
    elif classic_france_compact.match(phone_tag['v']):
        # Split the 10 digits into pairs, then replace the leading 0.
        value = " ".join(phone_tag['v'][i:i+2] for i in range(0, len(phone_tag['v']), 2))
        value = re.sub(r'^\d', '+33 ', value)
        main_dict[phone_tag['k']] = value
    else:
        main_dict[phone_tag['k']] = phone_tag['v']
In [16]:
"""
Please find below a test of the function which presents how the function works.
"""
test_phone_tags_dict = {}
for tag in test_tags:
tag_dict = {}
if tag.attrib['k'] == "phone":
clean_phone_numbers(tag.attrib,tag_dict)
if tag_dict:
test_phone_tags_dict[tag.attrib['k'] + " " + tag.attrib['v']] = tag_dict
pp.pprint(take(20, test_phone_tags_dict.iteritems()))
Shape data
In [17]:
CREATED = ["version", "changeset", "timestamp", "user", "uid"]
POS = ["lon", "lat"]
BUILDING_TYPES = ["amenity", "shop", "sport", "place", "service", "building", "highway"]

def shape_element(element):
    """
    Shape each top level element of our XML file.
    Every "node" or "way" top level element is reviewed by this function, which builds a dedicated dictionary called "node".
    Each node dictionary is then added to the data list, which will be inserted into MongoDB.
    """
    node = {}
    pos = []
    node_refs = []
    created = {}
    address = {}
    types = {}
    if element.tag == "node" or element.tag == "way":
        types['type'] = element.tag
        if 'lat' in element.attrib.keys() and 'lon' in element.attrib.keys():
            try:
                lat = float(element.attrib['lat'])
                lon = float(element.attrib['lon'])
                pos.insert(0, lat)
                pos.insert(1, lon)
            except ValueError:
                pass
        for k, m in element.attrib.items():
            if k not in POS:
                if k in CREATED:
                    created[k] = m
                else:
                    node[k] = m
        for child in element:
            if child.tag == "nd":
                node_refs.append(child.attrib['ref'])
            elif child.tag == "tag":
                if child.attrib['k'].startswith("addr:"):
                    clean_street_type(child.attrib, address)
                elif child.attrib['k'] == 'phone':
                    clean_phone_numbers(child.attrib, node)
                elif child.attrib['k'] in BUILDING_TYPES:
                    types[child.attrib['k']] = child.attrib['v']
        if types:
            node['types'] = types
        if created:
            node['created'] = created
        if pos:
            node['pos'] = pos
        if address:
            node['address'] = address
        if node_refs:
            node['node_refs'] = node_refs
        return node
    else:
        return None

def process_map(file_in, pretty=False):
    """Shape every element of the file and collect the results in a list."""
    data = []
    for _, element in ET.iterparse(file_in):
        el = shape_element(element)
        if el:
            data.append(el)
    return data
In [18]:
data = process_map('plaisir.osm', True)
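The json and codecs modules imported in the first cell are not used anywhere else in this notebook; they would typically serve to dump the shaped data to disk. A minimal sketch of that step, assuming a hypothetical output file name "plaisir.osm.json" (for instance to load the data with mongoimport instead of pymongo):
In [ ]:
# Sketch: write one JSON document per line (a mongoimport-friendly layout).
# The file name "plaisir.osm.json" is a hypothetical choice.
with codecs.open("plaisir.osm.json", "w", encoding="utf-8") as fo:
    for el in data:
        fo.write(json.dumps(el) + "\n")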
In [31]:
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.osm_udacity
In [33]:
from bson.objectid import ObjectId
def insert_data(data, db):
    for item in data:
        item['_id'] = ObjectId()
        db.plaisir_osm.insert_one(item)

insert_data(data, db)
print db.plaisir_osm.find_one()
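For a dataset of this size, a bulk insert is usually much faster than a per-document loop, and MongoDB can generate the ObjectIds itself. A sketch of the alternative, assuming pymongo 3.x:
In [ ]:
# Sketch (assumes pymongo 3.x): insert_many sends the documents in batches
# and lets MongoDB assign the _id values.
def insert_data_bulk(data, db):
    db.plaisir_osm.insert_many(data)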
In [34]:
def make_group_pipeline(type_node):
    pipeline = [{'$group': {'_id': type_node, 'count': {'$sum': 1}}},
                {'$sort': {'count': -1}},
                {'$limit': 5}]
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.aggregate(pipeline)]

pipeline = make_group_pipeline('$types.type')
result = aggregate(db.plaisir_osm, pipeline)
pp.pprint(result)
What is the most popular type of shop in Plaisir?
In [35]:
pipeline_shop = make_group_pipeline('$types.shop')
result_shop = aggregate(db.plaisir_osm, pipeline_shop)
pp.pprint(result_shop)
Bakery is the most popular type of shop in Plaisir. No kidding... Plaisir is in France :)
What is the sport with the most facilities in Plaisir?
In [36]:
pipeline_sport = make_group_pipeline('$types.sport')
result_sport = aggregate(db.plaisir_osm, pipeline_sport)
pp.pprint(result_sport)
Tennis is the sport with the most facilities in Plaisir.
Are there more restaurants or fast food places in Plaisir?
In [37]:
pipeline_restaurant = [{'$match': {"$or": [{"types.amenity": "restaurant"}, {"types.amenity": "fast_food"}]}},
                       {'$group': {'_id': '$types.amenity', 'count': {'$sum': 1}}},
                       {'$sort': {'count': -1}}]
result_restaurant = aggregate(db.plaisir_osm, pipeline_restaurant)
pp.pprint(result_restaurant)
There are more restaurants than fast food places in Plaisir... Good news!
Is there a correlation between the number of amenity and the number of bus stop in a given area?
In [38]:
#Check the geospatial index - the few lines below verify that the 2d index is working properly.
for doc in db.plaisir_osm.find({'pos': {'$near': [48.5, 1.95]}}):
    pp.pprint(doc)
    break
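The $near query above only works if a geospatial index exists on the pos field; the creation step itself is not shown in these cells. A sketch of how it could be built (a 2d index, since pos is stored as a flat [lat, lon] pair):
In [ ]:
# Sketch: build a 2d geospatial index on "pos" so $near / $geoWithin queries work.
db.plaisir_osm.create_index([("pos", pymongo.GEO2D)])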
In [39]:
#Get max and min latitude and longitude
In [40]:
for doc in db.plaisir_osm.aggregate([
    {"$unwind": "$pos"},
    {"$group": {
        "_id": "$_id",
        "lat": {"$first": "$pos"},
        "lon": {"$last": "$pos"}
    }},
    {"$group": {
        "_id": "null",
        "minLat": {"$min": "$lat"},
        "minLon": {"$min": "$lon"},
        "maxLat": {"$max": "$lat"},
        "maxLon": {"$max": "$lon"}
    }}
]):
    pp.pprint(doc)
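As a side note, the $unwind / $first / $last trick above relies on pos having exactly two entries. On MongoDB 3.2 or later, $arrayElemAt would express the same bounds query more directly; a sketch under that assumption:
In [ ]:
# Sketch (assumes MongoDB >= 3.2): read lat / lon straight out of the pos array.
for doc in db.plaisir_osm.aggregate([
    {"$match": {"pos": {"$exists": True}}},
    {"$project": {"lat": {"$arrayElemAt": ["$pos", 0]},
                  "lon": {"$arrayElemAt": ["$pos", 1]}}},
    {"$group": {"_id": None,
                "minLat": {"$min": "$lat"}, "maxLat": {"$max": "$lat"},
                "minLon": {"$min": "$lon"}, "maxLon": {"$max": "$lon"}}}
]):
    pp.pprint(doc)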
What I need to do: split the map into a grid of small boxes, then count, for each box, the number of bus stops and the number of amenities (excluding benches).
In [41]:
main_dict = {}

def frange(start, stop, step):
    # Float range generator (note: the float step accumulates small rounding errors).
    i = start
    while i < stop:
        yield i
        i += step

for lat in frange(48.76, 48.86, 0.01):
    for lon in frange(1.85, 2.05, 0.02):
        key = str(lon) + " - " + str(lat)
        main_dict[key] = {}
        bus_stop = 0
        amenity = 0
        for doc in db.plaisir_osm.find({'pos': {'$geoWithin': {'$box': [[lat, lon], [(lat + 0.01), (lon + 0.02)]]}}}):
            if 'highway' in doc['types']:
                if doc['types']['highway'] == "bus_stop":
                    bus_stop += 1
            elif 'amenity' in doc['types']:
                # Benches are excluded from the amenity count.
                if doc['types']['amenity'] == 'bench':
                    pass
                else:
                    amenity += 1
        main_dict[key]['bus_stop'] = bus_stop
        main_dict[key]['amenity'] = amenity
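Because frange accumulates a float step, the dictionary keys built from str(lon) and str(lat) can pick up floating point noise (e.g. "1.9100000000000001 - 48.76"). Rounding the coordinates when building the key would keep them readable; a sketch of a hypothetical helper, my suggestion rather than part of the original run:
In [ ]:
# Sketch: stable, readable grid-cell keys (hypothetical helper).
def cell_key(lon, lat):
    # Round to 2 decimals to strip float-accumulation noise.
    return "%.2f - %.2f" % (lon, lat)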
In [42]:
new_dict = {}
for key in main_dict:
    if main_dict[key]['amenity'] != 0 and main_dict[key]['bus_stop'] != 0:
        new_dict[key] = main_dict[key]
In [43]:
#Now that the dictionary is ready for the analysis, I can move forward using Pandas.
In [44]:
%matplotlib inline
import seaborn as sns
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.DataFrame.from_dict(new_dict,orient="index")
In [45]:
df2 = df.groupby('amenity').aggregate(np.average)
df2.plot()
Out[45]:
Looking at this plot, we can easily conclude that there is no correlation between the number of amenities and the number of bus stops in a given area.
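To back this visual impression with a number, the correlation can also be computed directly on the DataFrame; a minimal sketch using the df built above (Pearson correlation, pandas' default):
In [ ]:
# Sketch: quantify the relationship seen in the plot.
print df['amenity'].corr(df['bus_stop'])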
In order to improve the dataset and ease the analysis, we could implement:
Pros / cons regarding these suggestions: